In [1]:
import pandas as pd
import numpy as np
from itertools import *
Outline:¶
- load in data
- summary, count nans
- feature engineering
- (a) datetime cols
- (b) categorical cols
- -> correlation matrix of each angle
- investigate nans
- options
- selected mix of drop cols, ML regression model with xgboost, and drop rows
- solution part 1: drop cols
- solution part 2a: ML regression model with xgboost
- solution part 2b: drop rows for nan cols below accuracy threshold
- Summary
- 91% of observations retained with accuracy of filling missing values of 80% or higher
- final dataset is 131,000 observations
- export dataset
Step 1: load in data¶
In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('mode.chained_assignment',None)
np.set_printoptions(threshold=np.inf)
def open_data(repo, file):
with open("".join([repo, file]), "r", errors="ignore") as r:
file = pd.read_csv(r, engine="python", parse_dates=["Date"])
print(file.info())
return file
repo = r"C:/Users/benno/OneDrive/Python/Dash/Deployment_ready/rainfall_aus/assets/data/"
file = "weatherAUS.csv"
rain = open_data(repo, file)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 145460 entries, 0 to 145459 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 145460 non-null datetime64[ns] 1 Location 145460 non-null object 2 MinTemp 143975 non-null float64 3 MaxTemp 144199 non-null float64 4 Rainfall 142199 non-null float64 5 Evaporation 82670 non-null float64 6 Sunshine 75625 non-null float64 7 WindGustDir 135134 non-null object 8 WindGustSpeed 135197 non-null float64 9 WindDir9am 134894 non-null object 10 WindDir3pm 141232 non-null object 11 WindSpeed9am 143693 non-null float64 12 WindSpeed3pm 142398 non-null float64 13 Humidity9am 142806 non-null float64 14 Humidity3pm 140953 non-null float64 15 Pressure9am 130395 non-null float64 16 Pressure3pm 130432 non-null float64 17 Cloud9am 89572 non-null float64 18 Cloud3pm 86102 non-null float64 19 Temp9am 143693 non-null float64 20 Temp3pm 141851 non-null float64 21 RainToday 142199 non-null object 22 RainTomorrow 142193 non-null object dtypes: datetime64[ns](1), float64(16), object(6) memory usage: 25.5+ MB None
Step 2: summarize data¶
- most cols have missing values
- category of "highly sparse" columns @ 50-60% nonnulls
- category of "90s" columns @ 90-99% nonnulls
In [3]:
import warnings
warnings.filterwarnings("ignore")
def summary_dataframe_small(df):
return pd.DataFrame({'features': df.columns,'dtypes': [df.dtypes[i] for i in range(len(df.columns))],
'nulls':[df[df.columns[i]].isnull().sum() for i in range(len(df.columns))],'nonnuls / 1.0': (df.shape[0] -
df.isnull().sum().values) / df.shape[0],'uniques': [df[j].nunique() for j in df.columns]})
summary_dataframe_small(rain)
Out[3]:
| features | dtypes | nulls | nonnuls / 1.0 | uniques | |
|---|---|---|---|---|---|
| 0 | Date | datetime64[ns] | 0 | 1.000000 | 3436 |
| 1 | Location | object | 0 | 1.000000 | 49 |
| 2 | MinTemp | float64 | 1485 | 0.989791 | 389 |
| 3 | MaxTemp | float64 | 1261 | 0.991331 | 505 |
| 4 | Rainfall | float64 | 3261 | 0.977581 | 681 |
| 5 | Evaporation | float64 | 62790 | 0.568335 | 358 |
| 6 | Sunshine | float64 | 69835 | 0.519902 | 145 |
| 7 | WindGustDir | object | 10326 | 0.929011 | 16 |
| 8 | WindGustSpeed | float64 | 10263 | 0.929445 | 67 |
| 9 | WindDir9am | object | 10566 | 0.927361 | 16 |
| 10 | WindDir3pm | object | 4228 | 0.970934 | 16 |
| 11 | WindSpeed9am | float64 | 1767 | 0.987852 | 43 |
| 12 | WindSpeed3pm | float64 | 3062 | 0.978950 | 44 |
| 13 | Humidity9am | float64 | 2654 | 0.981754 | 101 |
| 14 | Humidity3pm | float64 | 4507 | 0.969016 | 101 |
| 15 | Pressure9am | float64 | 15065 | 0.896432 | 546 |
| 16 | Pressure3pm | float64 | 15028 | 0.896686 | 549 |
| 17 | Cloud9am | float64 | 55888 | 0.615784 | 10 |
| 18 | Cloud3pm | float64 | 59358 | 0.591929 | 10 |
| 19 | Temp9am | float64 | 1767 | 0.987852 | 441 |
| 20 | Temp3pm | float64 | 3609 | 0.975189 | 502 |
| 21 | RainToday | object | 3261 | 0.977581 | 2 |
| 22 | RainTomorrow | object | 3267 | 0.977540 | 2 |
In [4]:
def summary_dataframe_large(df):
summary = pd.DataFrame({
"feature": df.columns,
"dtype" : [str(df[col].dtype) for col in df.columns],
"nulls": [f"{df[col].isnull().sum()}" for col in df.columns],
"nonnulls%": [f"{round(((1 - df[col].isnull().sum() / len(df)) * 100), 2)}%" for col in df.columns],
"STATS": ["|" for col in df.columns],
"min": [df[col].min().round(2) if any(t in str(df[col].dtype) for t in ("float", "int")) else " " for col in df.columns],
"50%": [df[col].median().round(2) if any(t in str(df[col].dtype) for t in ("float", "int")) else " "for col in df.columns],
"max": [df[col].max().round(2) if any(t in str(df[col].dtype) for t in ("float", "int")) else " " for col in df.columns],
"stdev": [df[col].std().round(2) if any(t in str(df[col].dtype) for t in ("float", "int")) else " " for col in df.columns],
"mean": [df[col].mean().round(2) if any(t in str(df[col].dtype) for t in ("float", "int")) else " " for col in df.columns],
"skew": ["" if not any(t in str(df[col].dtype) for t in ("float", "int")) else "left" if df[col].median() > df[col].mean() else "right" if df[col].median() < df[col].mean() else "equal" for col in df.columns],
"cat_mode": [" " if any(t in str(df[col].dtype) for t in ("float", "int")) else df[col].mode()[0] for col in df.columns],
"cat_mode": [" " if any(t in str(df[col].dtype) for t in ("float", "int")) else df[col].mode()[0] for col in df.columns],
"COUNTS": ["|" for col in df.columns],
"nuniques": [df[col].nunique() for col in df.columns],
"vcs-": [df[col].value_counts().reset_index()["count"].min() for col in df.columns],
"vcs+": [df[col].value_counts().reset_index()["count"].max() for col in df.columns],
"RANDOM": ["|" for col in df.columns],
"row[random]" :[df[col].iloc[np.random.randint(len(df))].round(2) if any(t in str(df[col].dtype) for t in ("float", "int")) else df[col].iloc[np.random.randint(len(df))] for col in df.columns],
})
print(df.shape)
return summary
summary = summary_dataframe_large(rain)
summary
(145460, 23)
Out[4]:
| feature | dtype | nulls | nonnulls% | STATS | min | 50% | max | stdev | mean | skew | cat_mode | COUNTS | nuniques | vcs- | vcs+ | RANDOM | row[random] | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Date | datetime64[ns] | 0 | 100.0% | | | 2013-03-01 00:00:00 | | | 3436 | 1 | 49 | | | 2015-10-04 00:00:00 | ||||||
| 1 | Location | object | 0 | 100.0% | | | Canberra | | | 49 | 1578 | 3436 | | | NorahHead | ||||||
| 2 | MinTemp | float64 | 1485 | 98.98% | | | -8.5 | 12.0 | 33.9 | 6.4 | 12.19 | right | | | 389 | 1 | 899 | | | 16.7 | |
| 3 | MaxTemp | float64 | 1261 | 99.13% | | | -4.8 | 22.6 | 48.1 | 7.12 | 23.22 | right | | | 505 | 1 | 885 | | | 21.3 | |
| 4 | Rainfall | float64 | 3261 | 97.76% | | | 0.0 | 0.0 | 371.0 | 8.48 | 2.36 | right | | | 681 | 1 | 91080 | | | 0.0 | |
| 5 | Evaporation | float64 | 62790 | 56.83% | | | 0.0 | 4.8 | 145.0 | 4.19 | 5.47 | right | | | 358 | 1 | 3339 | | | 7.0 | |
| 6 | Sunshine | float64 | 69835 | 51.99% | | | 0.0 | 8.4 | 14.5 | 3.79 | 7.61 | left | | | 145 | 1 | 2359 | | | 3.4 | |
| 7 | WindGustDir | object | 10326 | 92.9% | | | W | | | 16 | 6548 | 9915 | | | NE | ||||||
| 8 | WindGustSpeed | float64 | 10263 | 92.94% | | | 6.0 | 39.0 | 135.0 | 13.61 | 40.04 | right | | | 67 | 1 | 9215 | | | 50.0 | |
| 9 | WindDir9am | object | 10566 | 92.74% | | | N | | | 16 | 7024 | 11758 | | | SW | ||||||
| 10 | WindDir3pm | object | 4228 | 97.09% | | | SE | | | 16 | 6590 | 10838 | | | WNW | ||||||
| 11 | WindSpeed9am | float64 | 1767 | 98.79% | | | 0.0 | 13.0 | 130.0 | 8.92 | 14.04 | right | | | 43 | 1 | 13649 | | | 13.0 | |
| 12 | WindSpeed3pm | float64 | 3062 | 97.89% | | | 0.0 | 19.0 | 87.0 | 8.81 | 18.66 | left | | | 44 | 1 | 12580 | | | 17.0 | |
| 13 | Humidity9am | float64 | 2654 | 98.18% | | | 0.0 | 70.0 | 100.0 | 19.03 | 68.88 | left | | | 101 | 1 | 3391 | | | 79.0 | |
| 14 | Humidity3pm | float64 | 4507 | 96.9% | | | 0.0 | 52.0 | 100.0 | 20.8 | 51.54 | left | | | 101 | 4 | 2751 | | | 65.0 | |
| 15 | Pressure9am | float64 | 15065 | 89.64% | | | 980.5 | 1017.6 | 1041.0 | 7.11 | 1017.65 | right | | | 546 | 1 | 816 | | | 1018.8 | |
| 16 | Pressure3pm | float64 | 15028 | 89.67% | | | 977.1 | 1015.2 | 1039.6 | 7.04 | 1015.26 | right | | | 549 | 1 | 786 | | | NaN | |
| 17 | Cloud9am | float64 | 55888 | 61.58% | | | 0.0 | 5.0 | 9.0 | 2.89 | 4.45 | left | | | 10 | 2 | 19972 | | | NaN | |
| 18 | Cloud3pm | float64 | 59358 | 59.19% | | | 0.0 | 5.0 | 9.0 | 2.72 | 4.51 | left | | | 10 | 1 | 18229 | | | NaN | |
| 19 | Temp9am | float64 | 1767 | 98.79% | | | -7.2 | 16.7 | 40.2 | 6.49 | 16.99 | right | | | 441 | 1 | 912 | | | 13.3 | |
| 20 | Temp3pm | float64 | 3609 | 97.52% | | | -5.4 | 21.1 | 46.7 | 6.94 | 21.68 | right | | | 502 | 1 | 882 | | | 23.2 | |
| 21 | RainToday | object | 3261 | 97.76% | | | No | | | 2 | 31880 | 110319 | | | No | ||||||
| 22 | RainTomorrow | object | 3267 | 97.75% | | | No | | | 2 | 31877 | 110316 | | | Yes |
In [5]:
rain.shape
Out[5]:
(145460, 23)
Step 3a: Feature Engineering, datetime¶
- datetime cols
In [6]:
rain_ml = rain.copy()
In [7]:
from datetime import datetime as dt
for df in [rain_ml]:
df['Datetime'] = df['Date'].apply(lambda x: dt.strptime(str(x), "%Y-%m-%d %H:%M:%S"))
df['Weekday'] = [t.day_name() for t in df.Datetime]
df['Month'] = [t.month_name() for t in df.Datetime]
df['Day'] = [t.day for t in df.Datetime]
df['Hour'] = [t.hour for t in df.Datetime]
df['Minute'] = [t.minute for t in df.Datetime]
df['Date'] = [t.date() for t in df.Datetime]
In [8]:
# drop "datetime" dtype cols as some ml models cannot handle this datatype
rain_ml = rain_ml.drop(["Datetime", "Date"], axis=1)
Correlation, angle (A)¶
- after datetime col creation
- before onehot encoding
In [9]:
import plotly.io as pio
import plotly.express as px
import plotly.io as pio
import plotly.graph_objs as go
from plotly.subplots import make_subplots
def heatmap_allfeatures_pearson(df, height=800, colors="Oryel"):
df = df.select_dtypes(["number"])
corr = df.corr().round(2)
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
corr = corr.mask(mask)
g = px.imshow(corr, text_auto=True, color_continuous_scale=colors, zmin=-1, zmax=1)
g.update_layout(
{ "template": "plotly_white",
"title": "heatmap, pearson correlation",
"xaxis":{"showgrid": False,
"zeroline": False},
"yaxis": {"showgrid": False,
"zeroline": False},
"height": height,
}).update_traces(xgap=2,ygap=2)
return g, corr
fig, corr = heatmap_allfeatures_pearson(rain_ml, height=700, colors="RdYlBu_r")
fig
Step 3b: Feature Engineering of categoricals¶
- onehot encoding
In [10]:
def pull_categoricals(df):
return [j for j in df.columns if "object" in str(df[j].dtype)]
categoricals = pull_categoricals(rain_ml)
In [11]:
categoricals
Out[11]:
['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow', 'Weekday', 'Month']
In [12]:
def pull_continuous(df, categoricals):
return [j for j in df.columns if j not in categoricals]
continuous = pull_continuous(rain_ml, categoricals)
In [13]:
from sklearn.preprocessing import OneHotEncoder
def onehot_final(df, categoricals):
return pd.DataFrame(OneHotEncoder().fit_transform(df[categoricals].astype(str)).toarray(),
columns = [i for j in [[str(j) + '_' + str(i) for i in np.unique(df[j].astype(str).values)] for j in categoricals] for i in j]
).reset_index().rename(columns={"index": "id"}).merge(df.drop(categoricals, axis=1).reset_index().rename(columns={"index": "id"})).drop(["id"], axis=1)
rain_ml = onehot_final(rain_ml, categoricals)
Correlation, angle (B)¶
- onehot encoded columns
- zoom in to see data vis
In [53]:
fig, corr = heatmap_allfeatures_pearson(rain_ml, height=700, colors="RdYlBu_r")
fig
Step 4: investigate NaNs¶
In [15]:
summary_dataframe_small(rain_ml)
Out[15]:
| features | dtypes | nulls | nonnuls / 1.0 | uniques | |
|---|---|---|---|---|---|
| 0 | Location_Adelaide | float64 | 0 | 1.000000 | 2 |
| 1 | Location_Albany | float64 | 0 | 1.000000 | 2 |
| 2 | Location_Albury | float64 | 0 | 1.000000 | 2 |
| 3 | Location_AliceSprings | float64 | 0 | 1.000000 | 2 |
| 4 | Location_BadgerysCreek | float64 | 0 | 1.000000 | 2 |
| 5 | Location_Ballarat | float64 | 0 | 1.000000 | 2 |
| 6 | Location_Bendigo | float64 | 0 | 1.000000 | 2 |
| 7 | Location_Brisbane | float64 | 0 | 1.000000 | 2 |
| 8 | Location_Cairns | float64 | 0 | 1.000000 | 2 |
| 9 | Location_Canberra | float64 | 0 | 1.000000 | 2 |
| 10 | Location_Cobar | float64 | 0 | 1.000000 | 2 |
| 11 | Location_CoffsHarbour | float64 | 0 | 1.000000 | 2 |
| 12 | Location_Dartmoor | float64 | 0 | 1.000000 | 2 |
| 13 | Location_Darwin | float64 | 0 | 1.000000 | 2 |
| 14 | Location_GoldCoast | float64 | 0 | 1.000000 | 2 |
| 15 | Location_Hobart | float64 | 0 | 1.000000 | 2 |
| 16 | Location_Katherine | float64 | 0 | 1.000000 | 2 |
| 17 | Location_Launceston | float64 | 0 | 1.000000 | 2 |
| 18 | Location_Melbourne | float64 | 0 | 1.000000 | 2 |
| 19 | Location_MelbourneAirport | float64 | 0 | 1.000000 | 2 |
| 20 | Location_Mildura | float64 | 0 | 1.000000 | 2 |
| 21 | Location_Moree | float64 | 0 | 1.000000 | 2 |
| 22 | Location_MountGambier | float64 | 0 | 1.000000 | 2 |
| 23 | Location_MountGinini | float64 | 0 | 1.000000 | 2 |
| 24 | Location_Newcastle | float64 | 0 | 1.000000 | 2 |
| 25 | Location_Nhil | float64 | 0 | 1.000000 | 2 |
| 26 | Location_NorahHead | float64 | 0 | 1.000000 | 2 |
| 27 | Location_NorfolkIsland | float64 | 0 | 1.000000 | 2 |
| 28 | Location_Nuriootpa | float64 | 0 | 1.000000 | 2 |
| 29 | Location_PearceRAAF | float64 | 0 | 1.000000 | 2 |
| 30 | Location_Penrith | float64 | 0 | 1.000000 | 2 |
| 31 | Location_Perth | float64 | 0 | 1.000000 | 2 |
| 32 | Location_PerthAirport | float64 | 0 | 1.000000 | 2 |
| 33 | Location_Portland | float64 | 0 | 1.000000 | 2 |
| 34 | Location_Richmond | float64 | 0 | 1.000000 | 2 |
| 35 | Location_Sale | float64 | 0 | 1.000000 | 2 |
| 36 | Location_SalmonGums | float64 | 0 | 1.000000 | 2 |
| 37 | Location_Sydney | float64 | 0 | 1.000000 | 2 |
| 38 | Location_SydneyAirport | float64 | 0 | 1.000000 | 2 |
| 39 | Location_Townsville | float64 | 0 | 1.000000 | 2 |
| 40 | Location_Tuggeranong | float64 | 0 | 1.000000 | 2 |
| 41 | Location_Uluru | float64 | 0 | 1.000000 | 2 |
| 42 | Location_WaggaWagga | float64 | 0 | 1.000000 | 2 |
| 43 | Location_Walpole | float64 | 0 | 1.000000 | 2 |
| 44 | Location_Watsonia | float64 | 0 | 1.000000 | 2 |
| 45 | Location_Williamtown | float64 | 0 | 1.000000 | 2 |
| 46 | Location_Witchcliffe | float64 | 0 | 1.000000 | 2 |
| 47 | Location_Wollongong | float64 | 0 | 1.000000 | 2 |
| 48 | Location_Woomera | float64 | 0 | 1.000000 | 2 |
| 49 | WindGustDir_E | float64 | 0 | 1.000000 | 2 |
| 50 | WindGustDir_ENE | float64 | 0 | 1.000000 | 2 |
| 51 | WindGustDir_ESE | float64 | 0 | 1.000000 | 2 |
| 52 | WindGustDir_N | float64 | 0 | 1.000000 | 2 |
| 53 | WindGustDir_NE | float64 | 0 | 1.000000 | 2 |
| 54 | WindGustDir_NNE | float64 | 0 | 1.000000 | 2 |
| 55 | WindGustDir_NNW | float64 | 0 | 1.000000 | 2 |
| 56 | WindGustDir_NW | float64 | 0 | 1.000000 | 2 |
| 57 | WindGustDir_S | float64 | 0 | 1.000000 | 2 |
| 58 | WindGustDir_SE | float64 | 0 | 1.000000 | 2 |
| 59 | WindGustDir_SSE | float64 | 0 | 1.000000 | 2 |
| 60 | WindGustDir_SSW | float64 | 0 | 1.000000 | 2 |
| 61 | WindGustDir_SW | float64 | 0 | 1.000000 | 2 |
| 62 | WindGustDir_W | float64 | 0 | 1.000000 | 2 |
| 63 | WindGustDir_WNW | float64 | 0 | 1.000000 | 2 |
| 64 | WindGustDir_WSW | float64 | 0 | 1.000000 | 2 |
| 65 | WindGustDir_nan | float64 | 0 | 1.000000 | 2 |
| 66 | WindDir9am_E | float64 | 0 | 1.000000 | 2 |
| 67 | WindDir9am_ENE | float64 | 0 | 1.000000 | 2 |
| 68 | WindDir9am_ESE | float64 | 0 | 1.000000 | 2 |
| 69 | WindDir9am_N | float64 | 0 | 1.000000 | 2 |
| 70 | WindDir9am_NE | float64 | 0 | 1.000000 | 2 |
| 71 | WindDir9am_NNE | float64 | 0 | 1.000000 | 2 |
| 72 | WindDir9am_NNW | float64 | 0 | 1.000000 | 2 |
| 73 | WindDir9am_NW | float64 | 0 | 1.000000 | 2 |
| 74 | WindDir9am_S | float64 | 0 | 1.000000 | 2 |
| 75 | WindDir9am_SE | float64 | 0 | 1.000000 | 2 |
| 76 | WindDir9am_SSE | float64 | 0 | 1.000000 | 2 |
| 77 | WindDir9am_SSW | float64 | 0 | 1.000000 | 2 |
| 78 | WindDir9am_SW | float64 | 0 | 1.000000 | 2 |
| 79 | WindDir9am_W | float64 | 0 | 1.000000 | 2 |
| 80 | WindDir9am_WNW | float64 | 0 | 1.000000 | 2 |
| 81 | WindDir9am_WSW | float64 | 0 | 1.000000 | 2 |
| 82 | WindDir9am_nan | float64 | 0 | 1.000000 | 2 |
| 83 | WindDir3pm_E | float64 | 0 | 1.000000 | 2 |
| 84 | WindDir3pm_ENE | float64 | 0 | 1.000000 | 2 |
| 85 | WindDir3pm_ESE | float64 | 0 | 1.000000 | 2 |
| 86 | WindDir3pm_N | float64 | 0 | 1.000000 | 2 |
| 87 | WindDir3pm_NE | float64 | 0 | 1.000000 | 2 |
| 88 | WindDir3pm_NNE | float64 | 0 | 1.000000 | 2 |
| 89 | WindDir3pm_NNW | float64 | 0 | 1.000000 | 2 |
| 90 | WindDir3pm_NW | float64 | 0 | 1.000000 | 2 |
| 91 | WindDir3pm_S | float64 | 0 | 1.000000 | 2 |
| 92 | WindDir3pm_SE | float64 | 0 | 1.000000 | 2 |
| 93 | WindDir3pm_SSE | float64 | 0 | 1.000000 | 2 |
| 94 | WindDir3pm_SSW | float64 | 0 | 1.000000 | 2 |
| 95 | WindDir3pm_SW | float64 | 0 | 1.000000 | 2 |
| 96 | WindDir3pm_W | float64 | 0 | 1.000000 | 2 |
| 97 | WindDir3pm_WNW | float64 | 0 | 1.000000 | 2 |
| 98 | WindDir3pm_WSW | float64 | 0 | 1.000000 | 2 |
| 99 | WindDir3pm_nan | float64 | 0 | 1.000000 | 2 |
| 100 | RainToday_No | float64 | 0 | 1.000000 | 2 |
| 101 | RainToday_Yes | float64 | 0 | 1.000000 | 2 |
| 102 | RainToday_nan | float64 | 0 | 1.000000 | 2 |
| 103 | RainTomorrow_No | float64 | 0 | 1.000000 | 2 |
| 104 | RainTomorrow_Yes | float64 | 0 | 1.000000 | 2 |
| 105 | RainTomorrow_nan | float64 | 0 | 1.000000 | 2 |
| 106 | Weekday_Friday | float64 | 0 | 1.000000 | 2 |
| 107 | Weekday_Monday | float64 | 0 | 1.000000 | 2 |
| 108 | Weekday_Saturday | float64 | 0 | 1.000000 | 2 |
| 109 | Weekday_Sunday | float64 | 0 | 1.000000 | 2 |
| 110 | Weekday_Thursday | float64 | 0 | 1.000000 | 2 |
| 111 | Weekday_Tuesday | float64 | 0 | 1.000000 | 2 |
| 112 | Weekday_Wednesday | float64 | 0 | 1.000000 | 2 |
| 113 | Month_April | float64 | 0 | 1.000000 | 2 |
| 114 | Month_August | float64 | 0 | 1.000000 | 2 |
| 115 | Month_December | float64 | 0 | 1.000000 | 2 |
| 116 | Month_February | float64 | 0 | 1.000000 | 2 |
| 117 | Month_January | float64 | 0 | 1.000000 | 2 |
| 118 | Month_July | float64 | 0 | 1.000000 | 2 |
| 119 | Month_June | float64 | 0 | 1.000000 | 2 |
| 120 | Month_March | float64 | 0 | 1.000000 | 2 |
| 121 | Month_May | float64 | 0 | 1.000000 | 2 |
| 122 | Month_November | float64 | 0 | 1.000000 | 2 |
| 123 | Month_October | float64 | 0 | 1.000000 | 2 |
| 124 | Month_September | float64 | 0 | 1.000000 | 2 |
| 125 | MinTemp | float64 | 1485 | 0.989791 | 389 |
| 126 | MaxTemp | float64 | 1261 | 0.991331 | 505 |
| 127 | Rainfall | float64 | 3261 | 0.977581 | 681 |
| 128 | Evaporation | float64 | 62790 | 0.568335 | 358 |
| 129 | Sunshine | float64 | 69835 | 0.519902 | 145 |
| 130 | WindGustSpeed | float64 | 10263 | 0.929445 | 67 |
| 131 | WindSpeed9am | float64 | 1767 | 0.987852 | 43 |
| 132 | WindSpeed3pm | float64 | 3062 | 0.978950 | 44 |
| 133 | Humidity9am | float64 | 2654 | 0.981754 | 101 |
| 134 | Humidity3pm | float64 | 4507 | 0.969016 | 101 |
| 135 | Pressure9am | float64 | 15065 | 0.896432 | 546 |
| 136 | Pressure3pm | float64 | 15028 | 0.896686 | 549 |
| 137 | Cloud9am | float64 | 55888 | 0.615784 | 10 |
| 138 | Cloud3pm | float64 | 59358 | 0.591929 | 10 |
| 139 | Temp9am | float64 | 1767 | 0.987852 | 441 |
| 140 | Temp3pm | float64 | 3609 | 0.975189 | 502 |
| 141 | Day | int64 | 0 | 1.000000 | 31 |
| 142 | Hour | int64 | 0 | 1.000000 | 1 |
| 143 | Minute | int64 | 0 | 1.000000 | 1 |
In [16]:
rain_ml.columns[-19:]
Out[16]:
Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
'Temp9am', 'Temp3pm', 'Day', 'Hour', 'Minute'],
dtype='object')
In [17]:
nancols = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
'Temp9am', 'Temp3pm',]
In [18]:
nan1_mk = ( rain_ml["MinTemp"].isnull() )
s0_mk = ( rain_ml["Sunshine"] == 0 )
c0_mk = (rain_ml["Cloud9am"] == 0 )
# rain_ml[nan_mk][:50]
# rain_ml[s0_mk].iloc[:50, -19:]
# rain_ml[c0_mk].iloc[:50, -19:]
Step 4: Findings¶
data appears to be MCAR, with the missing data not forming a pattern
Step 5: Options to deal with missing values¶
Problem 1: 50-60% missing value columns¶
- option 0: gather new data
- option 1: drop rows (keep about 35% of the data)
- option 2: drop columns (keep all observation-wise data, don't consider cols with nulls)
Problem 2: 90-99% missing value columns¶
- option 0: gather new data
- option 1: drop rows (keep about 80% of the data)
- option 2: drop columns (this would be the majority of the data deleted)
- option 3: interpolation with ML model (guess the values using the data at hand; 83-98% accuracy)
Solutions¶
- problem 1:
- option 2
- problem 2:
- option 3, then option 1
Step 6: Solution 1), Drop Cols¶
drop class of feature with 50-60% nonnulls
In [19]:
dropcols = ["Evaporation", "Sunshine", "Cloud9am", "Cloud3pm"]
rain_ml = rain_ml.drop(dropcols, axis=1)
Step 7: Solution 2.i) ML Regression Model¶
- -> 1x: test performance among all models
- -> 2x: select model, remove other models, predict, and interpolate
In [54]:
# 1x:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
model_lst = [
LinearRegression(n_jobs=-1),
DecisionTreeRegressor(),
RandomForestRegressor(n_jobs=-1),
GradientBoostingRegressor(),
xgb.XGBRegressor(n_jobs=-1),
]
In [21]:
# 2x:
model_lst = [
xgb.XGBRegressor(n_jobs=-1),
]
In [22]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
def model_selection_kfold_regression(models, df, dep_var):
df = df.dropna().reset_index(drop=True)
data = df.drop([dep_var],axis=1).squeeze()
target = df[[dep_var]].squeeze()
idx = np.random.choice(range(0,len(df)), size=round(len(df)*0.80), replace=False)
x_train = data.iloc[idx].reset_index(drop=True)
y_train = target.iloc[idx].reset_index(drop=True)
x_test = data.iloc[data.index.difference(idx)].reset_index(drop=True)
y_test = target.iloc[data.index.difference(idx)].reset_index(drop=True)
scores_dct = {}
for model in models:
clf = model
clf.fit(x_train, y_train)
preds = clf.predict(x_test)
cv_score = np.mean(cross_val_score(clf, data, target, cv=KFold(n_splits=5, shuffle=True, random_state=42)))
scores_dct[str(model)] = {}
scores_dct[str(model)]["model"] = model
scores_dct[str(model)]["preds"] = preds
scores_dct[str(model)]["scores"] = {
'mean_squared_error':metrics.mean_squared_error(preds, y_test),
'train':clf.score(x_train, y_train),
'test':clf.score(x_test, y_test),
'cv':cv_score
}
return scores_dct
In [23]:
def scores_dct_to_df(dct):
df = pd.DataFrame(
index=["XGBoost" if "base_score" in str(j) else str(j) for j in dct.keys()],
columns=list(np.array(list(dct.values()))[0]["scores"].keys()),
data=[list(j["scores"].values()) for j in list(dct.values())]
)
return df
In [24]:
def interpolate_via_model(df, column, model):
interpolate = pd.DataFrame({
"true": df[column],
"preds": model.predict(df.drop([column], axis=1)),
})
nan_mk = ( interpolate["true"].isnull() )
interpolate["true"][nan_mk] = interpolate["preds"][nan_mk]
return interpolate["true"]
Step 7a) feature "MinTemp"¶
- 1x: sampling of models
In [55]:
model_dct_mintemp = model_selection_kfold_regression(model_lst, rain_ml, "MinTemp")
scores_df = scores_dct_to_df(model_dct_mintemp)
scores_df.sort_values(by=["mean_squared_error"],ascending=True)
Out[55]:
| mean_squared_error | train | test | cv | |
|---|---|---|---|---|
| XGBoost | 2.939828 | 0.943602 | 0.929640 | 0.930224 |
| RandomForestRegressor(n_jobs=-1) | 3.406263 | 0.988639 | 0.918477 | 0.919240 |
| LinearRegression(n_jobs=-1) | 3.645535 | 0.914853 | 0.912750 | 0.914249 |
| GradientBoostingRegressor() | 3.941957 | 0.908978 | 0.905656 | 0.907212 |
| DecisionTreeRegressor() | 7.203550 | 1.000000 | 0.827595 | 0.830032 |
In [26]:
model_mintemp = model_dct_mintemp[list(model_dct_mintemp.keys())[0]]["model"]
rain_ml["MinTemp"] = interpolate_via_model(rain_ml, "MinTemp", model_mintemp)
Step 7b) feature "MaxTemp"¶
- 2x: after viewing above format for all features, select model that perfoms the best
In [27]:
scores_dct_maxtemp = model_selection_kfold_regression(model_lst, rain_ml, "MaxTemp")
scores_df = scores_dct_to_df(scores_dct_maxtemp)
scores_df.sort_values(by=["mean_squared_error"],ascending=True)
Out[27]:
| mean_squared_error | train | test | cv | |
|---|---|---|---|---|
| XGBoost | 0.715151 | 0.989359 | 0.985244 | 0.985408 |
In [28]:
model_maxtemp = scores_dct_maxtemp[list(model_dct_mintemp.keys())[0]]["model"]
rain_ml["MaxTemp"] = interpolate_via_model(rain_ml, "MaxTemp", model_maxtemp)
Step 7c) feature "Rainfall"¶
- model performed below threshold of 80%; do not use model to interpoate
In [29]:
scores_dct_rainfall = model_selection_kfold_regression(model_lst, rain_ml, "Rainfall")
scores_df = scores_dct_to_df(scores_dct_rainfall)
scores_df.sort_values(by=["mean_squared_error"],ascending=True)
Out[29]:
| mean_squared_error | train | test | cv | |
|---|---|---|---|---|
| XGBoost | 35.722158 | 0.769334 | 0.420896 | 0.422957 |
In [30]:
# model_rainfall = scores_dct_rainfall["XGBRegress"]["model"]
# rain_ml["Rainfall"] = interpolate_via_model(rain_ml, "Rainfall", model_rainfall)
Step 7d) feature "WindSpeed9am"¶
- model performed below threshold of 80%; do not use model to interpoate
In [31]:
# 1:
scores_dct_ws9am = model_selection_kfold_regression(model_lst, rain_ml, "WindSpeed9am")
scores_df = scores_dct_to_df(scores_dct_ws9am)
scores_df.sort_values(by=["mean_squared_error"],ascending=True)
Out[31]:
| mean_squared_error | train | test | cv | |
|---|---|---|---|---|
| XGBoost | 24.398231 | 0.757035 | 0.689277 | 0.690469 |
In [32]:
# model_ws9am = scores_dct_ws9am["XGBRegress"]["model"]
# rain_ml["WindSpeed9am"] = interpolate_via_model(rain_ml, "WindSpeed9am", model_ws9am)
Step 7e) feature "WindSpeed3pm"¶
- model performed below threshold of 80%; do not use model to interpoate
In [33]:
scores_dct_ws3pm = model_selection_kfold_regression(model_lst, rain_ml, "WindSpeed3pm")
scores_df = scores_dct_to_df(scores_dct_ws3pm)
scores_df.sort_values(by=["mean_squared_error"], ascending=True)
Out[33]:
| mean_squared_error | train | test | cv | |
|---|---|---|---|---|
| XGBoost | 22.767067 | 0.756524 | 0.698011 | 0.691884 |
In [34]:
# model_ws3pm = scores_dct_ws3pm["XGBRegress"]["model"]
# rain_ml["WindSpeed3pm"] = interpolate_via_model(rain_ml, "WindSpeed3pm", model_ws3pm)
Step 7f) feature "Humidity9am"¶
In [35]:
scores_dct_h9am = model_selection_kfold_regression(model_lst, rain_ml, "Humidity9am")
scores_df = scores_dct_to_df(scores_dct_h9am)
scores_df.sort_values(by=["mean_squared_error"],ascending=True)
Out[35]:
| mean_squared_error | train | test | cv | |
|---|---|---|---|---|
| XGBoost | 58.993709 | 0.871371 | 0.838998 | 0.838083 |
In [36]:
model_h9am = scores_dct_h9am[list(model_dct_mintemp.keys())[0]]["model"]
rain_ml["Humidity9am"] = interpolate_via_model(rain_ml, "Humidity9am", model_h9am)
Step 7g) feature "Humidity3pm"¶
In [37]:
scores_dct_h3pm = model_selection_kfold_regression(model_lst, rain_ml, "Humidity3pm")
scores_df = scores_dct_to_df(scores_dct_h3pm)
scores_df.sort_values(by=["mean_squared_error"],ascending=True)
Out[37]:
| mean_squared_error | train | test | cv | |
|---|---|---|---|---|
| XGBoost | 51.932807 | 0.906245 | 0.879736 | 0.878621 |
In [38]:
model_h3pm = scores_dct_h3pm[list(model_dct_mintemp.keys())[0]]["model"]
rain_ml["Humidity3pm"] = interpolate_via_model(rain_ml, "Humidity3pm", model_h3pm)
Step 7h) feature "Pressure9am"¶
In [39]:
scores_dct_p9am = model_selection_kfold_regression(model_lst, rain_ml, "Pressure9am")
scores_df = scores_dct_to_df(scores_dct_p9am)
scores_df.sort_values(by=["mean_squared_error"],ascending=True)
Out[39]:
| mean_squared_error | train | test | cv | |
|---|---|---|---|---|
| XGBoost | 1.489629 | 0.977829 | 0.970852 | 0.970344 |
In [40]:
model_p9am = scores_dct_p9am[list(model_dct_mintemp.keys())[0]]["model"]
rain_ml["Pressure9am"] = interpolate_via_model(rain_ml, "Pressure9am", model_p9am)
Step 7i) feature "Pressure3pm"¶
In [41]:
# 1
scores_dct_p3pm = model_selection_kfold_regression(model_lst, rain_ml, "Pressure3pm")
scores_df = scores_dct_to_df(scores_dct_p3pm)
scores_df.sort_values(by=["mean_squared_error"],ascending=True)
Out[41]:
| mean_squared_error | train | test | cv | |
|---|---|---|---|---|
| XGBoost | 1.490953 | 0.977034 | 0.96956 | 0.969553 |
In [42]:
model_p3pm = scores_dct_p3pm[list(model_dct_mintemp.keys())[0]]["model"]
rain_ml["Pressure3pm"] = interpolate_via_model(rain_ml, "Pressure3pm", model_p3pm)
Step 7j) feature "Temp9am"¶
In [43]:
scores_dct_t9am = model_selection_kfold_regression(model_lst, rain_ml, "Temp9am")
scores_df = scores_dct_to_df(scores_dct_t9am)
scores_df.sort_values(by=["mean_squared_error"],ascending=True)
Out[43]:
| mean_squared_error | train | test | cv | |
|---|---|---|---|---|
| XGBoost | 1.167374 | 0.978179 | 0.972661 | 0.972582 |
In [44]:
model_t9am = scores_dct_t9am[list(model_dct_mintemp.keys())[0]]["model"]
rain_ml["Temp9am"] = interpolate_via_model(rain_ml, "Temp9am", model_t9am)
Step 7j) feature "Temp3pm"¶
In [45]:
scores_dct_t3pm = model_selection_kfold_regression(model_lst, rain_ml, "Temp3pm")
scores_df = scores_dct_to_df(scores_dct_t3pm)
scores_df.sort_values(by=["mean_squared_error"],ascending=True)
Out[45]:
| mean_squared_error | train | test | cv | |
|---|---|---|---|---|
| XGBoost | 0.646544 | 0.990824 | 0.98678 | 0.987138 |
In [46]:
model_t3pm = scores_dct_t3pm[list(model_dct_mintemp.keys())[0]]["model"]
rain_ml["Temp3pm"] = interpolate_via_model(rain_ml, "Temp3pm", model_t3pm)
Step 7: Summary¶
- 3 features did not perform well enough to use interpolation methods
- 8 features did perform well enough, with scores test && cv >= 80%
Step 8: Solution 2.ii) Drop Rows¶
where 3 above features are nan
In [47]:
rain_ml1 = rain_ml.copy()
In [48]:
rain_ml1 = rain_ml1.dropna()
In [49]:
rain_ml1.shape
Out[49]:
(132181, 140)
Step 9: Summarize¶
- Final removed columns numbers 4
- Final row-wise retention is ~132,000 / ~145,000, or 90.9% retention of observations
- Final shape is 132,181 x 140 with 0 null values
In [50]:
rain_ml1.shape[0] / len(rain_ml)
Out[50]:
0.9087102983638113
Step 10: Export data¶
locally
In [51]:
repo = r"C:/Users/benno/OneDrive/Python/Dash/Deployment_ready/rainfall_aus/assets/data/"
file = "weatherAUS_cleaned.csv"
rain_ml1.to_csv("".join([repo, file]), index=False)
In [52]:
summary_dataframe_large(rain_ml1)
(132181, 140)
Out[52]:
| feature | dtype | nulls | nonnulls% | STATS | min | 50% | max | stdev | mean | skew | cat_mode | COUNTS | nuniques | vcs- | vcs+ | RANDOM | row[random] | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Location_Adelaide | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 3064 | 129117 | | | 0.0 | |
| 1 | Location_Albany | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 0.00 | 0.00 | 0.00 | equal | | | 1 | 132181 | 132181 | | | 0.0 | |
| 2 | Location_Albury | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2984 | 129197 | | | 0.0 | |
| 3 | Location_AliceSprings | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2978 | 129203 | | | 0.0 | |
| 4 | Location_BadgerysCreek | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2864 | 129317 | | | 0.0 | |
| 5 | Location_Ballarat | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2992 | 129189 | | | 0.0 | |
| 6 | Location_Bendigo | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2993 | 129188 | | | 0.0 | |
| 7 | Location_Brisbane | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 3122 | 129059 | | | 0.0 | |
| 8 | Location_Cairns | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2969 | 129212 | | | 0.0 | |
| 9 | Location_Canberra | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 3073 | 129108 | | | 0.0 | |
| 10 | Location_Cobar | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2944 | 129237 | | | 0.0 | |
| 11 | Location_CoffsHarbour | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.14 | 0.02 | right | | | 2 | 2632 | 129549 | | | 0.0 | |
| 12 | Location_Dartmoor | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2934 | 129247 | | | 0.0 | |
| 13 | Location_Darwin | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 3148 | 129033 | | | 0.0 | |
| 14 | Location_GoldCoast | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2885 | 129296 | | | 0.0 | |
| 15 | Location_Hobart | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 3164 | 129017 | | | 0.0 | |
| 16 | Location_Katherine | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.11 | 0.01 | right | | | 2 | 1529 | 130652 | | | 0.0 | |
| 17 | Location_Launceston | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2991 | 129190 | | | 0.0 | |
| 18 | Location_Melbourne | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.13 | 0.02 | right | | | 2 | 2419 | 129762 | | | 0.0 | |
| 19 | Location_MelbourneAirport | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2993 | 129188 | | | 0.0 | |
| 20 | Location_Mildura | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2999 | 129182 | | | 0.0 | |
| 21 | Location_Moree | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.14 | 0.02 | right | | | 2 | 2776 | 129405 | | | 0.0 | |
| 22 | Location_MountGambier | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2979 | 129202 | | | 1.0 | |
| 23 | Location_MountGinini | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.14 | 0.02 | right | | | 2 | 2563 | 129618 | | | 0.0 | |
| 24 | Location_Newcastle | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 0.00 | 0.00 | 0.00 | equal | | | 1 | 132181 | 132181 | | | 0.0 | |
| 25 | Location_Nhil | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.11 | 0.01 | right | | | 2 | 1563 | 130618 | | | 0.0 | |
| 26 | Location_NorahHead | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2883 | 129298 | | | 0.0 | |
| 27 | Location_NorfolkIsland | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2913 | 129268 | | | 0.0 | |
| 28 | Location_Nuriootpa | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2941 | 129240 | | | 0.0 | |
| 29 | Location_PearceRAAF | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.14 | 0.02 | right | | | 2 | 2628 | 129553 | | | 0.0 | |
| 30 | Location_Penrith | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2910 | 129271 | | | 0.0 | |
| 31 | Location_Perth | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 3188 | 128993 | | | 0.0 | |
| 32 | Location_PerthAirport | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2961 | 129220 | | | 0.0 | |
| 33 | Location_Portland | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2960 | 129221 | | | 0.0 | |
| 34 | Location_Richmond | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2886 | 129295 | | | 0.0 | |
| 35 | Location_Sale | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2858 | 129323 | | | 0.0 | |
| 36 | Location_SalmonGums | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2903 | 129278 | | | 0.0 | |
| 37 | Location_Sydney | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.13 | 0.02 | right | | | 2 | 2298 | 129883 | | | 0.0 | |
| 38 | Location_SydneyAirport | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2953 | 129228 | | | 0.0 | |
| 39 | Location_Townsville | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 3008 | 129173 | | | 0.0 | |
| 40 | Location_Tuggeranong | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2943 | 129238 | | | 0.0 | |
| 41 | Location_Uluru | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.11 | 0.01 | right | | | 2 | 1483 | 130698 | | | 0.0 | |
| 42 | Location_WaggaWagga | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2948 | 129233 | | | 0.0 | |
| 43 | Location_Walpole | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.14 | 0.02 | right | | | 2 | 2769 | 129412 | | | 0.0 | |
| 44 | Location_Watsonia | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2964 | 129217 | | | 0.0 | |
| 45 | Location_Williamtown | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.13 | 0.02 | right | | | 2 | 2432 | 129749 | | | 0.0 | |
| 46 | Location_Witchcliffe | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2919 | 129262 | | | 0.0 | |
| 47 | Location_Wollongong | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2928 | 129253 | | | 0.0 | |
| 48 | Location_Woomera | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.15 | 0.02 | right | | | 2 | 2947 | 129234 | | | 0.0 | |
| 49 | WindGustDir_E | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.25 | 0.07 | right | | | 2 | 9022 | 123159 | | | 0.0 | |
| 50 | WindGustDir_ENE | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.24 | 0.06 | right | | | 2 | 7956 | 124225 | | | 1.0 | |
| 51 | WindGustDir_ESE | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.23 | 0.05 | right | | | 2 | 7262 | 124919 | | | 0.0 | |
| 52 | WindGustDir_N | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.25 | 0.07 | right | | | 2 | 9009 | 123172 | | | 0.0 | |
| 53 | WindGustDir_NE | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.22 | 0.05 | right | | | 2 | 7017 | 125164 | | | 0.0 | |
| 54 | WindGustDir_NNE | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.21 | 0.05 | right | | | 2 | 6407 | 125774 | | | 0.0 | |
| 55 | WindGustDir_NNW | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.22 | 0.05 | right | | | 2 | 6513 | 125668 | | | 0.0 | |
| 56 | WindGustDir_NW | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.24 | 0.06 | right | | | 2 | 7971 | 124210 | | | 0.0 | |
| 57 | WindGustDir_S | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.25 | 0.07 | right | | | 2 | 8889 | 123292 | | | 0.0 | |
| 58 | WindGustDir_SE | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.26 | 0.07 | right | | | 2 | 9268 | 122913 | | | 0.0 | |
| 59 | WindGustDir_SSE | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.25 | 0.07 | right | | | 2 | 8947 | 123234 | | | 1.0 | |
| 60 | WindGustDir_SSW | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.25 | 0.06 | right | | | 2 | 8557 | 123624 | | | 0.0 | |
| 61 | WindGustDir_SW | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.25 | 0.07 | right | | | 2 | 8754 | 123427 | | | 0.0 | |
| 62 | WindGustDir_W | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.26 | 0.07 | right | | | 2 | 9705 | 122476 | | | 0.0 | |
| 63 | WindGustDir_WNW | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.24 | 0.06 | right | | | 2 | 8020 | 124161 | | | 0.0 | |
| 64 | WindGustDir_WSW | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.25 | 0.07 | right | | | 2 | 8845 | 123336 | | | 0.0 | |
| 65 | WindGustDir_nan | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.02 | 0.00 | right | | | 2 | 39 | 132142 | | | 0.0 | |
| 66 | WindDir9am_E | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.25 | 0.07 | right | | | 2 | 8660 | 123521 | | | 0.0 | |
| 67 | WindDir9am_ENE | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.23 | 0.06 | right | | | 2 | 7480 | 124701 | | | 0.0 | |
| 68 | WindDir9am_ESE | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.23 | 0.06 | right | | | 2 | 7291 | 124890 | | | 0.0 | |
| 69 | WindDir9am_N | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.28 | 0.08 | right | | | 2 | 10901 | 121280 | | | 0.0 | |
| 70 | WindDir9am_NE | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.22 | 0.05 | right | | | 2 | 7033 | 125148 | | | 0.0 | |
| 71 | WindDir9am_NNE | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.23 | 0.06 | right | | | 2 | 7682 | 124499 | | | 0.0 | |
| 72 | WindDir9am_NNW | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.23 | 0.06 | right | | | 2 | 7450 | 124731 | | | 0.0 | |
| 73 | WindDir9am_NW | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.23 | 0.06 | right | | | 2 | 7503 | 124678 | | | 0.0 | |
| 74 | WindDir9am_S | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.24 | 0.06 | right | | | 2 | 8121 | 124060 | | | 0.0 | |
| 75 | WindDir9am_SE | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.25 | 0.06 | right | | | 2 | 8525 | 123656 | | | 0.0 | |
| 76 | WindDir9am_SSE | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.25 | 0.07 | right | | | 2 | 8722 | 123459 | | | 0.0 | |
| 77 | WindDir9am_SSW | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.23 | 0.05 | right | | | 2 | 7123 | 125058 | | | 0.0 | |
| 78 | WindDir9am_SW | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.23 | 0.06 | right | | | 2 | 7684 | 124497 | | | 0.0 | |
| 79 | WindDir9am_W | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.23 | 0.06 | right | | | 2 | 7496 | 124685 | | | 0.0 | |
| 80 | WindDir9am_WNW | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.22 | 0.05 | right | | | 2 | 6817 | 125364 | | | 0.0 | |
| 81 | WindDir9am_WSW | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.22 | 0.05 | right | | | 2 | 6507 | 125674 | | | 0.0 | |
| 82 | WindDir9am_nan | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.23 | 0.05 | right | | | 2 | 7186 | 124995 | | | 0.0 | |
| 83 | WindDir3pm_E | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.24 | 0.06 | right | | | 2 | 7759 | 124422 | | | 0.0 | |
| 84 | WindDir3pm_ENE | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.23 | 0.06 | right | | | 2 | 7471 | 124710 | | | 0.0 | |
| 85 | WindDir3pm_ESE | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.24 | 0.06 | right | | | 2 | 7847 | 124334 | | | 0.0 | |
| 86 | WindDir3pm_N | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.24 | 0.06 | right | | | 2 | 8427 | 123754 | | | 1.0 | |
| 87 | WindDir3pm_NE | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.23 | 0.06 | right | | | 2 | 7719 | 124462 | | | 0.0 | |
| 88 | WindDir3pm_NNE | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.21 | 0.05 | right | | | 2 | 6318 | 125863 | | | 0.0 | |
| 89 | WindDir3pm_NNW | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.23 | 0.06 | right | | | 2 | 7477 | 124704 | | | 0.0 | |
| 90 | WindDir3pm_NW | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.24 | 0.06 | right | | | 2 | 7838 | 124343 | | | 0.0 | |
| 91 | WindDir3pm_S | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.25 | 0.07 | right | | | 2 | 9209 | 122972 | | | 0.0 | |
| 92 | WindDir3pm_SE | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.26 | 0.07 | right | | | 2 | 9391 | 122790 | | | 0.0 | |
| 93 | WindDir3pm_SSE | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.25 | 0.07 | right | | | 2 | 8844 | 123337 | | | 0.0 | |
| 94 | WindDir3pm_SSW | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.23 | 0.06 | right | | | 2 | 7728 | 124453 | | | 1.0 | |
| 95 | WindDir3pm_SW | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.25 | 0.07 | right | | | 2 | 8680 | 123501 | | | 1.0 | |
| 96 | WindDir3pm_W | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.26 | 0.07 | right | | | 2 | 9498 | 122683 | | | 0.0 | |
| 97 | WindDir3pm_WNW | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.24 | 0.06 | right | | | 2 | 8413 | 123768 | | | 0.0 | |
| 98 | WindDir3pm_WSW | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.25 | 0.07 | right | | | 2 | 8861 | 123320 | | | 0.0 | |
| 99 | WindDir3pm_nan | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.07 | 0.01 | right | | | 2 | 701 | 131480 | | | 0.0 | |
| 100 | RainToday_No | float64 | 0 | 100.0% | | | 0.00 | 1.0 | 1.00 | 0.42 | 0.78 | left | | | 2 | 29267 | 102914 | | | 1.0 | |
| 101 | RainToday_Yes | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.42 | 0.22 | right | | | 2 | 29267 | 102914 | | | 1.0 | |
| 102 | RainToday_nan | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 0.00 | 0.00 | 0.00 | equal | | | 1 | 132181 | 132181 | | | 0.0 | |
| 103 | RainTomorrow_No | float64 | 0 | 100.0% | | | 0.00 | 1.0 | 1.00 | 0.42 | 0.78 | left | | | 2 | 29575 | 102606 | | | 1.0 | |
| 104 | RainTomorrow_Yes | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.41 | 0.22 | right | | | 2 | 28730 | 103451 | | | 0.0 | |
| 105 | RainTomorrow_nan | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.08 | 0.01 | right | | | 2 | 845 | 131336 | | | 0.0 | |
| 106 | Weekday_Friday | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.35 | 0.14 | right | | | 2 | 18810 | 113371 | | | 0.0 | |
| 107 | Weekday_Monday | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.35 | 0.14 | right | | | 2 | 19045 | 113136 | | | 0.0 | |
| 108 | Weekday_Saturday | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.35 | 0.14 | right | | | 2 | 18557 | 113624 | | | 1.0 | |
| 109 | Weekday_Sunday | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.35 | 0.14 | right | | | 2 | 18779 | 113402 | | | 0.0 | |
| 110 | Weekday_Thursday | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.35 | 0.14 | right | | | 2 | 18972 | 113209 | | | 0.0 | |
| 111 | Weekday_Tuesday | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.35 | 0.14 | right | | | 2 | 19025 | 113156 | | | 0.0 | |
| 112 | Weekday_Wednesday | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.35 | 0.14 | right | | | 2 | 18993 | 113188 | | | 1.0 | |
| 113 | Month_April | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.27 | 0.08 | right | | | 2 | 10565 | 121616 | | | 0.0 | |
| 114 | Month_August | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.28 | 0.08 | right | | | 2 | 10912 | 121269 | | | 0.0 | |
| 115 | Month_December | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.26 | 0.08 | right | | | 2 | 10023 | 122158 | | | 0.0 | |
| 116 | Month_February | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.26 | 0.07 | right | | | 2 | 9743 | 122438 | | | 0.0 | |
| 117 | Month_January | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.29 | 0.09 | right | | | 2 | 11979 | 120202 | | | 0.0 | |
| 118 | Month_July | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.28 | 0.08 | right | | | 2 | 10968 | 121213 | | | 0.0 | |
| 119 | Month_June | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.28 | 0.09 | right | | | 2 | 11542 | 120639 | | | 0.0 | |
| 120 | Month_March | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.29 | 0.09 | right | | | 2 | 12134 | 120047 | | | 0.0 | |
| 121 | Month_May | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.29 | 0.09 | right | | | 2 | 12168 | 120013 | | | 0.0 | |
| 122 | Month_November | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.27 | 0.08 | right | | | 2 | 10671 | 121510 | | | 0.0 | |
| 123 | Month_October | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.28 | 0.08 | right | | | 2 | 11001 | 121180 | | | 0.0 | |
| 124 | Month_September | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 1.00 | 0.27 | 0.08 | right | | | 2 | 10475 | 121706 | | | 0.0 | |
| 125 | MinTemp | float64 | 0 | 100.0% | | | -8.50 | 11.9 | 33.90 | 6.47 | 12.15 | right | | | 489 | 1 | 819 | | | 4.0 | |
| 126 | MaxTemp | float64 | 0 | 100.0% | | | -4.80 | 22.8 | 48.10 | 7.20 | 23.31 | right | | | 570 | 1 | 755 | | | 24.4 | |
| 127 | Rainfall | float64 | 0 | 100.0% | | | 0.00 | 0.0 | 367.60 | 8.40 | 2.33 | right | | | 653 | 1 | 85032 | | | 0.4 | |
| 128 | WindGustSpeed | float64 | 0 | 100.0% | | | 6.00 | 39.0 | 135.00 | 13.58 | 39.98 | right | | | 67 | 1 | 9019 | | | 30.0 | |
| 129 | WindSpeed9am | float64 | 0 | 100.0% | | | 0.00 | 13.0 | 87.00 | 8.77 | 14.19 | right | | | 40 | 1 | 12503 | | | 9.0 | |
| 130 | WindSpeed3pm | float64 | 0 | 100.0% | | | 0.00 | 19.0 | 87.00 | 8.67 | 18.75 | left | | | 42 | 1 | 11937 | | | 17.0 | |
| 131 | Humidity9am | float64 | 0 | 100.0% | | | -20.90 | 70.0 | 112.82 | 19.38 | 68.59 | left | | | 1268 | 1 | 3239 | | | 65.0 | |
| 132 | Humidity3pm | float64 | 0 | 100.0% | | | -14.43 | 51.0 | 104.33 | 20.91 | 50.99 | left | | | 1764 | 1 | 2555 | | | 45.0 | |
| 133 | Pressure9am | float64 | 0 | 100.0% | | | 980.50 | 1018.3 | 1041.00 | 8.37 | 1019.00 | right | | | 9279 | 1 | 773 | | | 1027.2 | |
| 134 | Pressure3pm | float64 | 0 | 100.0% | | | 977.10 | 1015.9 | 1039.60 | 8.25 | 1016.56 | right | | | 9034 | 1 | 741 | | | 1018.9 | |
| 135 | Temp9am | float64 | 0 | 100.0% | | | -7.20 | 16.7 | 40.20 | 6.58 | 16.98 | right | | | 795 | 1 | 811 | | | 10.5 | |
| 136 | Temp3pm | float64 | 0 | 100.0% | | | -5.40 | 21.3 | 46.70 | 7.06 | 21.81 | right | | | 1325 | 1 | 784 | | | 23.1 | |
| 137 | Day | int64 | 0 | 100.0% | | | 1.00 | 16.0 | 31.00 | 8.80 | 15.72 | left | | | 31 | 2579 | 4369 | | | 2.0 | |
| 138 | Hour | int64 | 0 | 100.0% | | | 0.00 | 0.0 | 0.00 | 0.00 | 0.00 | equal | | | 1 | 132181 | 132181 | | | 0.0 | |
| 139 | Minute | int64 | 0 | 100.0% | | | 0.00 | 0.0 | 0.00 | 0.00 | 0.00 | equal | | | 1 | 132181 | 132181 | | | 0.0 |